library(quanteda)
library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
# Set `replication_dir` to the location of the master "BackslidingReplication"
# folder. It is left empty here as a placeholder: calling setwd("") raises
# "cannot change working directory", so the directory is only changed once a
# non-empty path has been filled in. With the placeholder untouched, the
# session simply stays in its current working directory.
replication_dir <- ""
if (nzchar(replication_dir)) {
  setwd(replication_dir)
}
library(quanteda)
library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
# Set the working directory to the location of the master
# "BackslidingReplication" folder.
# NOTE(review): this path ends in /data while every read below is also
# prefixed with data/ -- confirm which directory is the intended root.
setwd("/Users/kronick/Riker/Replication/data")

# --------------------------------
# load data
# --------------------------------
# Interview transcripts: semicolon-delimited files. The first `skip` lines of
# each file are discarded before parsing (10 for the 1998-11-01 file, 11 for
# the others). Only the Transcript and Speaker columns are kept (renamed to
# text / speaker) and each file is tagged with a `guest` label.
hcf <- read.csv(
  "data/marcelGranier/HCF-1998-10-18 procesado.csv",
  sep = ";", skip = 11
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HCF")

hsr1 <- read.csv(
  "data/marcelGranier/HSR-1997-05-19 procesado.csv",
  sep = ";", skip = 11
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

hsr2 <- read.csv(
  "data/marcelGranier/HSR-1998-11-01 procesado.csv",
  sep = ";", skip = 10
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

hsr3 <- read.csv(
  "data/marcelGranier/HSR-1998-12-03 procesado.csv",
  sep = ";", skip = 11
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

# Stack the four transcripts into one data frame and drop the per-file pieces.
granier_data <- rbind(hcf, hsr1, hsr2, hsr3)
rm(hcf, hsr1, hsr2, hsr3)

# load coded NNs
# The call is namespace-qualified because no library(readxl) precedes this
# line; a bare read_excel() would fail with "could not find function".
nns_pobreza <- readxl::read_excel("data/dictionaryExpansion/nns_pobreza_all.xlsx")
library(quanteda)
library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
# Set the working directory to the location of the master
# "BackslidingReplication" folder.
# NOTE(review): this path ends in /data while every read below is also
# prefixed with data/ -- confirm which directory is the intended root.
setwd("/Users/kronick/Riker/Replication/data")

# --------------------------------
# load data
# --------------------------------
# Interview transcripts: each file is semicolon-delimited and has leading
# lines that are skipped before parsing (10 for the 1998-11-01 file, 11 for
# the rest). The Transcript / Speaker columns are renamed to text / speaker
# and a `guest` label is attached to every row of the file.
hcf <- read.csv(
  "data/marcelGranier/HCF-1998-10-18 procesado.csv",
  sep = ";", skip = 11
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HCF")

hsr1 <- read.csv(
  "data/marcelGranier/HSR-1997-05-19 procesado.csv",
  sep = ";", skip = 11
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

hsr2 <- read.csv(
  "data/marcelGranier/HSR-1998-11-01 procesado.csv",
  sep = ";", skip = 10
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

hsr3 <- read.csv(
  "data/marcelGranier/HSR-1998-12-03 procesado.csv",
  sep = ";", skip = 11
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

# Combine all transcripts row-wise, then remove the intermediate objects.
granier_data <- rbind(hcf, hsr1, hsr2, hsr3)
rm(hcf, hsr1, hsr2, hsr3)

# load coded NNs
# Qualified with readxl:: because readxl has not been attached by any
# library() call above this line; the unqualified name would not resolve.
nns_pobreza <- readxl::read_excel("data/dictionaryExpansion/nns_pobreza_all.xlsx")
library(readxl)
library(quanteda)
library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
# Working directory: location of the master "BackslidingReplication" folder.
# NOTE(review): the path ends in /data and the reads below also start with
# data/ -- confirm the intended root.
setwd("/Users/kronick/Riker/Replication/data")

# --------------------------------
# load data
# --------------------------------
# Interview transcripts. Every file is semicolon-delimited; the leading
# `skip` lines are discarded, Transcript / Speaker become text / speaker,
# and a `guest` label is added.
hcf <- read.csv(
  "data/marcelGranier/HCF-1998-10-18 procesado.csv",
  sep = ";",
  skip = 11
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HCF")

hsr1 <- read.csv(
  "data/marcelGranier/HSR-1997-05-19 procesado.csv",
  sep = ";",
  skip = 11
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

hsr2 <- read.csv(
  "data/marcelGranier/HSR-1998-11-01 procesado.csv",
  sep = ";",
  skip = 10
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

hsr3 <- read.csv(
  "data/marcelGranier/HSR-1998-12-03 procesado.csv",
  sep = ";",
  skip = 11
) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

# One combined table; the per-file objects are no longer needed afterwards.
granier_data <- rbind(hcf, hsr1, hsr2, hsr3)
rm(hcf, hsr1, hsr2, hsr3)

# Coded NN spreadsheets, one per theme.
nns_pobreza <- read_excel("data/dictionaryExpansion/nns_pobreza_all.xlsx")
nns_constituyente <- read_excel("data/dictionaryExpansion/nns_constituyente_all.xlsx")

# --------------------------------
# pre-process corpus
# --------------------------------
# 1. unify the accented spelling of the speaker name,
# 2. keep non-empty turns spoken by one of the two listed speakers,
# 3. transliterate accented characters and lower-case the text.
granier_data <- granier_data %>%
  mutate(speaker = replace(speaker, speaker == "Hugo Chávez", "Hugo Chavez")) %>%
  filter(text != "", speaker %in% c("Hugo Chavez", "Henrique Salas Romer")) %>%
  mutate(
    text = tolower(
      chartr("ãâàèìòùáéíóöúüûñÀÈÌÒÙÁÉÍÓÚÑ", "aaaeiouaeioouuunAEIOUAEIOUN", text)
    )
  )

# quanteda corpus: one document per remaining row, speaker kept as a docvar.
corpus_granier <- corpus(
  granier_data$text,
  docvars = data.frame(speaker = granier_data$speaker)
)

# --------------------------------
# build dictionary
# --------------------------------
# Two keys, each filled from the `node` column of its spreadsheet.
# (The original note reads "top 20 terms"; the code itself takes every entry
# of `node` -- verify against the spreadsheets.)
dict <- dictionary(
  list(
    constituyente = nns_constituyente$node,
    pobreza = nns_pobreza$node
  )
)

# --------------------------------
# count dictionary terms in corpus
# --------------------------------
# Tokenise, build a lower-cased dfm, and collapse it onto the dictionary keys.
dfm_granier <- corpus_granier %>%
  tokens() %>%
  dfm(tolower = TRUE) %>%
  dfm_lookup(dictionary = dict)

# Per speaker/guest pair: dictionary hits divided by total tokens, reshaped
# to one row per theme. The added columns rely on the dfm rows being in the
# same order as the rows of granier_data.
dict_counts <- dfm_granier %>%
  convert(to = "data.frame") %>%
  mutate(
    num_tokens = unname(ntoken(granier_data$text)),
    speaker = granier_data$speaker,
    guest = granier_data$guest
  ) %>%
  group_by(speaker, guest) %>%
  summarize(
    num_tokens = sum(num_tokens),
    # num_tokens below already refers to the group total computed just above
    constituyente = sum(constituyente) / num_tokens,
    pobreza = sum(pobreza) / num_tokens,
    .groups = "drop_last"
  ) %>%
  select(-num_tokens) %>%
  pivot_longer(cols = c(constituyente, pobreza), names_to = "theme")

# --------------------------------
# table 1
# --------------------------------
dict_counts
library(quanteda)
library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
# Set the working directory to the master "BackslidingReplication" folder.
# NOTE(review): this path ends in /data while the file paths below are also
# prefixed with data/ -- confirm the intended root.
setwd("/Users/kronick/Riker/Replication/data")

# --------------------------------
# load data
# --------------------------------
# Interview transcripts (semicolon-delimited; leading lines skipped; only the
# transcript text and speaker are kept, plus a guest label per file).
hcf <-
  read.csv("data/marcelGranier/HCF-1998-10-18 procesado.csv", sep = ";", skip = 11) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HCF")
hsr1 <-
  read.csv("data/marcelGranier/HSR-1997-05-19 procesado.csv", sep = ";", skip = 11) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")
hsr2 <-
  read.csv("data/marcelGranier/HSR-1998-11-01 procesado.csv", sep = ";", skip = 10) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")
hsr3 <-
  read.csv("data/marcelGranier/HSR-1998-12-03 procesado.csv", sep = ";", skip = 11) %>%
  select(text = Transcript, speaker = Speaker) %>%
  mutate(guest = "HSR")

# Stack the transcripts and clean up the per-file data frames.
granier_data <- rbind(hcf, hsr1, hsr2, hsr3)
rm(hcf, hsr1, hsr2, hsr3)

# load coded NNs
nns_pobreza <- read_excel("data/dictionaryExpansion/nns_pobreza_all.xlsx")
nns_constituyente <- read_excel("data/dictionaryExpansion/nns_constituyente_all.xlsx")

# --------------------------------
# pre-process corpus
# --------------------------------
# Normalise the accented spelling of the speaker name so both variants match.
granier_data$speaker <- replace(
  granier_data$speaker,
  granier_data$speaker == "Hugo Chávez",
  "Hugo Chavez"
)

# Drop empty turns, then keep only the two speakers listed here.
granier_data <- granier_data %>%
  filter(text != "") %>%
  filter(speaker %in% c("Hugo Chavez", "Henrique Salas Romer"))

# Remove accents via a character-for-character map, then lower-case.
granier_data$text <- tolower(
  chartr(
    "ãâàèìòùáéíóöúüûñÀÈÌÒÙÁÉÍÓÚÑ",
    "aaaeiouaeioouuunAEIOUAEIOUN",
    granier_data$text
  )
)

# quanteda corpus with the speaker attached as a document variable.
corpus_granier <- corpus(
  granier_data$text,
  docvars = data.frame(speaker = granier_data$speaker)
)

# --------------------------------
# build dictionary
# --------------------------------
# Keys are the two themes; values come from the `node` column of each sheet.
# (Original note: "top 20 terms" -- the code uses the whole column; verify.)
dict <- dictionary(list(
  constituyente = nns_constituyente$node,
  pobreza = nns_pobreza$node
))

# --------------------------------
# count dictionary terms in corpus
# --------------------------------
# Document-feature matrix reduced to one column per dictionary key.
dfm_granier <- dfm_lookup(
  dfm(tokens(corpus_granier), tolower = TRUE),
  dictionary = dict
)

# Share of tokens matching each theme, per speaker/guest pair, in long form.
# Rows of the dfm are assumed to line up with rows of granier_data, since the
# corpus was built directly from granier_data$text.
dict_counts <- dfm_granier %>%
  convert("data.frame") %>%
  mutate(
    num_tokens = unname(ntoken(granier_data$text)),
    speaker = granier_data$speaker,
    guest = granier_data$guest
  ) %>%
  group_by(speaker, guest) %>%
  summarize(
    # theme hits over total tokens within the group
    across(c(constituyente, pobreza), ~ sum(.x) / sum(num_tokens)),
    .groups = "drop_last"
  ) %>%
  pivot_longer(cols = c(constituyente, pobreza), names_to = "theme")

# --------------------------------
# table 1
# --------------------------------
dict_counts
# Column header of the printed `dict_counts` table, pasted from the console.
# Kept as a comment: as bare text it is not valid R and stops the file from
# parsing.
#   speaker              guest theme           value

# Write the table to the working directory. The file name has to be a quoted
# string; the bare symbol granier2.csv is looked up as an object and fails
# with "object not found".
write.csv(dict_counts, "granier2.csv")

# Open the data viewer once, and only in an interactive session, so the
# script can also be run non-interactively.
if (interactive()) {
  View(dict_counts)
}
